{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "CreateLawSchoolDatasetFiles.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C9nlVtBd57GW",
        "colab_type": "text"
      },
      "source": [
        "**Copyright 2020 Google LLC.**\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
        "\n",
        "https://www.apache.org/licenses/LICENSE-2.0\n",
        "\n",
        "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VpEpTFph2ysp",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from __future__ import division\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import json\n",
        "import os,sys\n",
        "import seaborn as sns\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn.model_selection import train_test_split\n",
        "import numpy as np"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F-u1ecNmMiX3",
        "colab_type": "text"
      },
      "source": [
        "## Overview\n",
        "### Pre-process Law School Admissions Council Dataset (LSAC) \n",
        "\n",
        "Download the Law School dataset from: (http://www.seaphe.org/databases.php), convert SAS file to CSV, and save it in the `./group_agnostic_fairness/data/law_school` folder.\n",
        "\n",
        "Input: ./group_agnostic_fairness/data/law_school/lsac.csv\n",
        "\n",
        "Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_example_weights_with_label.json, IPS_example_weights_without_label.json"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oyFyRbFk7zox",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "pd.options.display.float_format = '{:,.2f}'.format\n",
        "dataset_base_dir = './group_agnostic_fairness/data/law_school/'\n",
        "dataset_file_name = 'lsac.csv'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "PgWxzZeyKog3",
        "colab_type": "text"
      },
      "source": [
        "### Processing original dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "chhYs8357xyU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Load the raw LSAC export, keep the columns of interest, and turn the\n",
        "# label / protected columns into readable categorical values.\n",
        "file_path = os.path.join(dataset_base_dir, dataset_file_name)\n",
        "with open(file_path, 'r') as csv_file:\n",
        "  temp_df = pd.read_csv(csv_file)\n",
        "\n",
        "# Columns of interest\n",
        "df = temp_df[['zfygpa','zgpa','DOB_yr','parttime','gender','race','tier','fam_inc','lsat','ugpa','pass_bar','index6040']].copy()\n",
        "renameColumns = {'gender':'sex',\n",
        "                 'index6040':'weighted_lsat_ugpa',\n",
        "                 'fam_inc':'family_income',\n",
        "                 'tier':'cluster_tier',\n",
        "                 'parttime':'isPartTime'}\n",
        "target_variable = 'pass_bar'\n",
        "target_value = 'Passed'\n",
        "\n",
        "# Renaming columns\n",
        "df = df.rename(columns=renameColumns)\n",
        "# `columns` selects what is written to train.csv / test.csv. It must list every\n",
        "# column of the processed frame: renameColumns.values() alone would leave out the\n",
        "# columns that were not renamed (e.g. the target 'pass_bar' and 'race').\n",
        "columns = list(df.columns)\n",
        "\n",
        "# NaN in 'pass_bar' refer to dropouts. Considering NaN as failing the bar.\n",
        "df[target_variable] = df[target_variable].fillna(value=0.0)\n",
        "df[target_variable] = (\n",
        "    df[target_variable].eq(1.0)\n",
        "    .map({True: target_value, False: 'Failed_or_not_attempted'})\n",
        "    .astype('category'))\n",
        "\n",
        "# Missing values in these numeric columns are replaced by 0.0; rows with a\n",
        "# missing value in any other column are dropped.\n",
        "for numeric_col in ['zfygpa', 'zgpa', 'DOB_yr']:\n",
        "  df[numeric_col] = df[numeric_col].fillna(value=0.0)\n",
        "df = df.dropna()\n",
        "\n",
        "# Binarize 'isPartTime'\n",
        "df['isPartTime'] = df['isPartTime'].eq(1.0).map({True: 'Yes', False: 'No'}).astype('category')\n",
        "\n",
        "# Process protected-column values: any value missing from the mapping becomes 'Other'.\n",
        "race_dict = {3.0:'Black',7.0:'White'}\n",
        "sex_dict = {'female':'Female','male':'Male'}\n",
        "df['race'] = df['race'].map(race_dict).fillna('Other').astype('category')\n",
        "df['sex'] = df['sex'].map(sex_dict).fillna('Other').astype('category')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VUdsXMczzBWe",
        "colab_type": "code",
        "outputId": "d58b81a9-48aa-4145-f44e-bc455601b84e",
        "colab": {
          "height": 204
        }
      },
      "source": [
        "# Preview the first five processed rows.\n",
        "df.head(n=5)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>zfygpa</th>\n",
              "      <th>zgpa</th>\n",
              "      <th>DOB_yr</th>\n",
              "      <th>isPartTime</th>\n",
              "      <th>sex</th>\n",
              "      <th>race</th>\n",
              "      <th>cluster_tier</th>\n",
              "      <th>family_income</th>\n",
              "      <th>lsat</th>\n",
              "      <th>ugpa</th>\n",
              "      <th>pass_bar</th>\n",
              "      <th>weighted_lsat_ugpa</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-1.79</td>\n",
              "      <td>0.00</td>\n",
              "      <td>68.00</td>\n",
              "      <td>No</td>\n",
              "      <td>Female</td>\n",
              "      <td>White</td>\n",
              "      <td>2.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>30.00</td>\n",
              "      <td>3.10</td>\n",
              "      <td>Failed_or_not_attempted</td>\n",
              "      <td>625.79</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1.33</td>\n",
              "      <td>1.88</td>\n",
              "      <td>69.00</td>\n",
              "      <td>No</td>\n",
              "      <td>Female</td>\n",
              "      <td>White</td>\n",
              "      <td>4.00</td>\n",
              "      <td>5.00</td>\n",
              "      <td>44.00</td>\n",
              "      <td>3.50</td>\n",
              "      <td>Passed</td>\n",
              "      <td>886.84</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>-0.11</td>\n",
              "      <td>-0.57</td>\n",
              "      <td>69.00</td>\n",
              "      <td>No</td>\n",
              "      <td>Female</td>\n",
              "      <td>White</td>\n",
              "      <td>2.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>29.00</td>\n",
              "      <td>3.50</td>\n",
              "      <td>Passed</td>\n",
              "      <td>650.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1.22</td>\n",
              "      <td>0.95</td>\n",
              "      <td>58.00</td>\n",
              "      <td>Yes</td>\n",
              "      <td>Female</td>\n",
              "      <td>White</td>\n",
              "      <td>3.00</td>\n",
              "      <td>5.00</td>\n",
              "      <td>35.00</td>\n",
              "      <td>3.00</td>\n",
              "      <td>Failed_or_not_attempted</td>\n",
              "      <td>694.74</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0.88</td>\n",
              "      <td>0.00</td>\n",
              "      <td>51.00</td>\n",
              "      <td>Yes</td>\n",
              "      <td>Female</td>\n",
              "      <td>White</td>\n",
              "      <td>2.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>39.00</td>\n",
              "      <td>2.90</td>\n",
              "      <td>Failed_or_not_attempted</td>\n",
              "      <td>747.89</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   zfygpa  zgpa  DOB_yr  ... ugpa                 pass_bar weighted_lsat_ugpa\n",
              "0   -1.79  0.00   68.00  ... 3.10  Failed_or_not_attempted             625.79\n",
              "1    1.33  1.88   69.00  ... 3.50                   Passed             886.84\n",
              "2   -0.11 -0.57   69.00  ... 3.50                   Passed             650.00\n",
              "3    1.22  0.95   58.00  ... 3.00  Failed_or_not_attempted             694.74\n",
              "4    0.88  0.00   51.00  ... 2.90  Failed_or_not_attempted             747.89\n",
              "\n",
              "[5 rows x 12 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zKNj_ZV2K_09",
        "colab_type": "text"
      },
      "source": [
        "### Shuffle and Split into Train (70%) and Test set (30%)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0ZLM1kXLz3PI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# The fixed random_state makes the shuffled 70/30 split repeatable.\n",
        "train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)\n",
        "\n",
        "# Both splits are written without header or index, restricted to `columns`.\n",
        "# The `with` block closes each file on exit.\n",
        "for split_file_name, split_df in (('train.csv', train_df), ('test.csv', test_df)):\n",
        "    output_file_path = os.path.join(dataset_base_dir, split_file_name)\n",
        "    with open(output_file_path, mode='w') as output_file:\n",
        "        split_df.to_csv(output_file, index=False, columns=columns, header=False)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1VQE85STLL46",
        "colab_type": "text"
      },
      "source": [
        "### Compute inverse propensity weights for each subgroup, and write them to the dataset directory.\n",
        "\n",
        "IPS_example_weights_with_label.json: json dictionary of the format\n",
        "        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2fkieHul02TL",
        "colab_type": "code",
        "outputId": "2e58832f-48e6-4d45-940d-e6c04f418c92",
        "colab": {
          "height": 34
        }
      },
      "source": [
        "# Weight of a subgroup = (number of training rows) / (number of training rows in the subgroup).\n",
        "# Subgroup ids are the 2-bit code <is_black><is_female>. The 'non-Black' groups contain\n",
        "# every race value other than 'Black', and 'non-Female' every sex value other than 'Female'.\n",
        "num_train = len(train_df)\n",
        "is_black = train_df.race == 'Black'\n",
        "is_female = train_df.sex == 'Female'\n",
        "subgroup_masks = {\n",
        "  0: ~is_black & ~is_female,  # 00: non-Black, non-Female\n",
        "  1: ~is_black & is_female,   # 01: non-Black, Female\n",
        "  2: is_black & ~is_female,   # 10: Black, non-Female\n",
        "  3: is_black & is_female,    # 11: Black, Female\n",
        "}\n",
        "IPS_example_weights_without_label = {\n",
        "  subgroup_id: num_train / int(mask.sum())\n",
        "  for subgroup_id, mask in subgroup_masks.items()\n",
        "}\n",
        "\n",
        "output_file_path = os.path.join(dataset_base_dir, 'IPS_example_weights_without_label.json')\n",
        "with open(output_file_path, mode='w') as output_file:\n",
        "    json.dump(IPS_example_weights_without_label, output_file)\n",
        "\n",
        "print(IPS_example_weights_without_label)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{0: 1.8843151171043293, 1: 2.488618103910016, 2: 36.36986301369863, 3: 25.013458950201883}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Dm15uo-R0-LB",
        "colab_type": "code",
        "outputId": "0c59cab3-3436-4fd2-c4f7-8cda8d0ba756",
        "colab": {
          "height": 34
        }
      },
      "source": [
        "# Weight of a subgroup = (number of training rows) / (number of training rows in the subgroup).\n",
        "# Subgroup ids are the 3-bit code <is_positive><is_black><is_female>, i.e.\n",
        "# 0 = 000 (negative, non-Black, non-Female) ... 7 = 111 (positive, Black, Female).\n",
        "# 'non-Black' covers every race value other than 'Black'.\n",
        "num_train = len(train_df)\n",
        "is_positive = train_df[target_variable] == target_value\n",
        "is_black = train_df.race == 'Black'\n",
        "is_female = train_df.sex == 'Female'\n",
        "\n",
        "IPS_example_weights_with_label = {}\n",
        "for subgroup_id in range(8):\n",
        "  in_subgroup = (\n",
        "      (is_positive == bool(subgroup_id & 4))\n",
        "      & (is_black == bool(subgroup_id & 2))\n",
        "      & (is_female == bool(subgroup_id & 1))\n",
        "  )\n",
        "  IPS_example_weights_with_label[subgroup_id] = num_train / int(in_subgroup.sum())\n",
        "\n",
        "output_file_path = os.path.join(dataset_base_dir, 'IPS_example_weights_with_label.json')\n",
        "with open(output_file_path, mode='w') as output_file:\n",
        "    json.dump(IPS_example_weights_with_label, output_file)\n",
        "\n",
        "print(IPS_example_weights_with_label)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{0: 10.194733955019199, 1: 13.545918367346939, 2: 82.23451327433628, 3: 63.214285714285715, 4: 2.3115671641791047, 5: 3.048720472440945, 6: 65.21052631578948, 7: 41.39198218262806}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8SQc7h9HLcSc",
        "colab_type": "text"
      },
      "source": [
        "### Construct vocabulary.json, and write to directory.\n",
        "\n",
        "vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YIebJG2YfMpv",
        "colab_type": "code",
        "outputId": "d22853e7-576b-416a-df26-db7faedf1670",
        "colab": {
          "height": 34
        }
      },
      "source": [
        "# Vocabulary of every categorical column, keyed by column name.\n",
        "# The categories are taken in the order pandas stores them. Passing them through\n",
        "# set() would make the order depend on string hashing, so vocabulary.json could\n",
        "# differ from one run to the next.\n",
        "cat_cols = train_df.select_dtypes(include='category').columns\n",
        "vocab_dict = {col: list(train_df[col].cat.categories) for col in cat_cols}\n",
        "\n",
        "output_file_path = os.path.join(dataset_base_dir, 'vocabulary.json')\n",
        "with open(output_file_path, mode='w') as output_file:\n",
        "    json.dump(vocab_dict, output_file)\n",
        "print(vocab_dict)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'isPartTime': ['No', 'Yes'], 'sex': ['Female', 'Male'], 'race': ['Black', 'White', 'Other'], 'pass_bar': ['Passed', 'Failed_or_not_attempted']}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "V9cxiG9SLfk6",
        "colab_type": "text"
      },
      "source": [
        "### Construct mean_std.json, and write to directory\n",
        "\n",
        "mean_std.json: json dictionary of the format {feature_name: [mean, std]},\n",
        "containing mean and std for numerical features. "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "URN20fCpFcdi",
        "colab_type": "code",
        "outputId": "d6969466-4c09-4d7a-94b9-4a6b85d5d0b5",
        "colab": {
          "height": 54
        }
      },
      "source": [
        "# describe() summarises the numeric columns of the training split; keep only\n",
        "# [mean, std] per column.\n",
        "summary_stats = train_df.describe().to_dict()\n",
        "mean_std_dict = {\n",
        "  feature_name: [stats['mean'], stats['std']]\n",
        "  for feature_name, stats in summary_stats.items()\n",
        "}\n",
        "\n",
        "output_file_path = os.path.join(dataset_base_dir, 'mean_std.json')\n",
        "with open(output_file_path, mode='w') as output_file:\n",
        "    json.dump(mean_std_dict, output_file)\n",
        "print(mean_std_dict)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'zfygpa': [0.007156308851224107, 0.956269325542025], 'zgpa': [0.005135324186171643, 0.9203686714713514], 'DOB_yr': [64.9954802259887, 6.374190672837983], 'cluster_tier': [3.7390906645143933, 1.183449020338574], 'family_income': [3.4257734732311005, 0.8794618881913022], 'lsat': [36.57297820823245, 5.629890085895137], 'ugpa': [3.2242292171105733, 0.41846631192390027], 'weighted_lsat_ugpa': [741.9962436595317, 107.69097610619035]}\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}